{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from  scipy import stats\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we will use this function for statistical significance testing\n",
    "# stats.ttest_ind_from_stats()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the per-prompt summary table; the first CSV column is used as the row index\n",
    "df = pd.read_csv(\n",
    "    'krippendorff_alpha_prompt.csv',\n",
    "    index_col=0,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>ci_lower</th>\n",
       "      <th>ci_upper</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>prompt</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>None_None</th>\n",
       "      <td>0.482314</td>\n",
       "      <td>0.444359</td>\n",
       "      <td>0.520270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>None_kojima-01</th>\n",
       "      <td>0.501504</td>\n",
       "      <td>0.463440</td>\n",
       "      <td>0.539568</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>None_zhou-01</th>\n",
       "      <td>0.546853</td>\n",
       "      <td>0.509682</td>\n",
       "      <td>0.584023</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>qa-10_None</th>\n",
       "      <td>0.495835</td>\n",
       "      <td>0.454311</td>\n",
       "      <td>0.537359</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>qa-12_None</th>\n",
       "      <td>0.527846</td>\n",
       "      <td>0.486854</td>\n",
       "      <td>0.568838</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>qa-13_None</th>\n",
       "      <td>0.535269</td>\n",
       "      <td>0.496902</td>\n",
       "      <td>0.573636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>qa-16_None</th>\n",
       "      <td>0.526286</td>\n",
       "      <td>0.485367</td>\n",
       "      <td>0.567205</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>qa-17_None</th>\n",
       "      <td>0.482364</td>\n",
       "      <td>0.442897</td>\n",
       "      <td>0.521832</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>refl-01_None</th>\n",
       "      <td>0.474420</td>\n",
       "      <td>0.434838</td>\n",
       "      <td>0.514003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zhou-01-ins_None</th>\n",
       "      <td>0.505230</td>\n",
       "      <td>0.466029</td>\n",
       "      <td>0.544432</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      mean  ci_lower  ci_upper\n",
       "prompt                                        \n",
       "None_None         0.482314  0.444359  0.520270\n",
       "None_kojima-01    0.501504  0.463440  0.539568\n",
       "None_zhou-01      0.546853  0.509682  0.584023\n",
       "qa-10_None        0.495835  0.454311  0.537359\n",
       "qa-12_None        0.527846  0.486854  0.568838\n",
       "qa-13_None        0.535269  0.496902  0.573636\n",
       "qa-16_None        0.526286  0.485367  0.567205\n",
       "qa-17_None        0.482364  0.442897  0.521832\n",
       "refl-01_None      0.474420  0.434838  0.514003\n",
       "zhou-01-ins_None  0.505230  0.466029  0.544432"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The 'std' column stored in the CSV is wrong, so discard it.\n",
    "# Reassign instead of dropping in place, and ignore a missing column, so the\n",
    "# cell can be re-run without raising a KeyError.\n",
    "df = df.drop(columns='std', errors='ignore')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def p_value_from_confidence_intervals(mean1, lower_ci1, upper_ci1, n1, mean2, lower_ci2, upper_ci2, n2):\n",
    "    # Calculate standard errors from confidence intervals\n",
    "    se1 = (upper_ci1 - lower_ci1) / (2 * stats.t.ppf((1 + 0.95) / 2, n1 - 1))\n",
    "    se2 = (upper_ci2 - lower_ci2) / (2 * stats.t.ppf((1 + 0.95) / 2, n2 - 1))\n",
    "\n",
    "    std1 = se1 * np.sqrt(n1)\n",
    "    std2 = se2 * np.sqrt(n2)\n",
    "\n",
    "    # Perform two-sample t-test (Welch's t-test) to obtain the t-statistic and p-value\n",
    "    t_stat, p_value = stats.ttest_ind_from_stats(mean1, std1, n1, mean2, std2, n2, equal_var=False)\n",
    "\n",
    "    return p_value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_1 = \"None_None\"\n",
    "dataset_2 = \"None_zhou-01\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up the mean and CI bounds by column name rather than by position, so the\n",
    "# unpacking does not depend on which columns df happens to contain.\n",
    "m1, ci_lower1, ci_upper1 = df.loc[dataset_1, ['mean', 'ci_lower', 'ci_upper']]\n",
    "m2, ci_lower2, ci_upper2 = df.loc[dataset_2, ['mean', 'ci_lower', 'ci_upper']]\n",
    "# Both groups get the same sample size: 11800 divided evenly over the rows of df\n",
    "# (NOTE(review): 11800 is presumably the total number of samples -- confirm).\n",
    "n1 = n2 = 11800 / len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.017227607178862517"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# p-value for dataset_1 vs dataset_2; shown as the cell output\n",
    "p_value_from_confidence_intervals(\n",
    "    m1, ci_lower1, ci_upper1, n1,\n",
    "    m2, ci_lower2, ci_upper2, n2,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_1 = \"zhou-01-ins_None\"\n",
    "dataset_2 = \"None_zhou-01\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up the mean and CI bounds by column name rather than by position, so the\n",
    "# unpacking does not depend on which columns df happens to contain.\n",
    "m1, ci_lower1, ci_upper1 = df.loc[dataset_1, ['mean', 'ci_lower', 'ci_upper']]\n",
    "m2, ci_lower2, ci_upper2 = df.loc[dataset_2, ['mean', 'ci_lower', 'ci_upper']]\n",
    "# Both groups get the same sample size: 11800 divided evenly over the rows of df\n",
    "# (NOTE(review): 11800 is presumably the total number of samples -- confirm).\n",
    "n1 = n2 = 11800 / len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.1307622066585317"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# p-value for dataset_1 vs dataset_2; shown as the cell output\n",
    "p_value_from_confidence_intervals(\n",
    "    m1, ci_lower1, ci_upper1, n1,\n",
    "    m2, ci_lower2, ci_upper2, n2,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_1 = \"None_zhou-01\"\n",
    "dataset_2 = \"None_kojima-01\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up the mean and CI bounds by column name rather than by position, so the\n",
    "# unpacking does not depend on which columns df happens to contain.\n",
    "m1, ci_lower1, ci_upper1 = df.loc[dataset_1, ['mean', 'ci_lower', 'ci_upper']]\n",
    "m2, ci_lower2, ci_upper2 = df.loc[dataset_2, ['mean', 'ci_lower', 'ci_upper']]\n",
    "# Both groups get the same sample size: 11800 divided evenly over the rows of df\n",
    "# (NOTE(review): 11800 is presumably the total number of samples -- confirm).\n",
    "n1 = n2 = 11800 / len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0945855920110885"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# p-value for dataset_1 vs dataset_2; shown as the cell output\n",
    "p_value_from_confidence_intervals(\n",
    "    m1, ci_lower1, ci_upper1, n1,\n",
    "    m2, ci_lower2, ci_upper2, n2,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
